In [1]:
Copied!
import matplotlib.pyplot as plt
import os
import missingno as msno
import numpy as np
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import os
import missingno as msno
import numpy as np
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
Merging two datasets¶
In [2]:
Copied!
import json
js15 = json.load(open('dataset/socrata_metadata_2015-building-energy-benchmarking.json', mode='r'))
js16 = json.load(open('dataset/socrata_metadata_2016-building-energy-benchmarking.json', mode='r'))
import json
js15 = json.load(open('dataset/socrata_metadata_2015-building-energy-benchmarking.json', mode='r'))
js16 = json.load(open('dataset/socrata_metadata_2016-building-energy-benchmarking.json', mode='r'))
In [3]:
Copied!
df15 = pd.read_csv('dataset/2015-building-energy-benchmarking.csv')
df16 = pd.read_csv('dataset/2016-building-energy-benchmarking.csv')
df15 = pd.read_csv('dataset/2015-building-energy-benchmarking.csv')
df16 = pd.read_csv('dataset/2016-building-energy-benchmarking.csv')
In [4]:
Copied!
print(df15.shape)
print(df15.columns)
print(df15.shape)
print(df15.columns)
(3340, 47)
Index(['OSEBuildingID', 'DataYear', 'BuildingType', 'PrimaryPropertyType',
'PropertyName', 'TaxParcelIdentificationNumber', 'Location',
'CouncilDistrictCode', 'Neighborhood', 'YearBuilt', 'NumberofBuildings',
'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking',
'PropertyGFABuilding(s)', 'ListOfAllPropertyUseTypes',
'LargestPropertyUseType', 'LargestPropertyUseTypeGFA',
'SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseTypeGFA',
'YearsENERGYSTARCertified', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)',
'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)',
'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)',
'NaturalGas(kBtu)', 'OtherFuelUse(kBtu)',
'GHGEmissions(MetricTonsCO2e)', 'GHGEmissionsIntensity(kgCO2e/ft2)',
'DefaultData', 'Comment', 'ComplianceStatus', 'Outlier',
'2010 Census Tracts',
'Seattle Police Department Micro Community Policing Plan Areas',
'City Council Districts', 'SPD Beats', 'Zip Codes'],
dtype='object')
In [5]:
Copied!
print(df16.shape)
print(df16.columns)
print(df16.shape)
print(df16.columns)
(3376, 46)
Index(['OSEBuildingID', 'DataYear', 'BuildingType', 'PrimaryPropertyType',
'PropertyName', 'Address', 'City', 'State', 'ZipCode',
'TaxParcelIdentificationNumber', 'CouncilDistrictCode', 'Neighborhood',
'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings',
'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking',
'PropertyGFABuilding(s)', 'ListOfAllPropertyUseTypes',
'LargestPropertyUseType', 'LargestPropertyUseTypeGFA',
'SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseTypeGFA',
'YearsENERGYSTARCertified', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)',
'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)',
'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)',
'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'ComplianceStatus',
'Outlier', 'TotalGHGEmissions', 'GHGEmissionsIntensity'],
dtype='object')
In [6]:
Copied!
# montrer les differences de colonnes entre les dataset
contained16in15 = [col in df16.columns for col in df15.columns ]
extra15 = df15.columns[np.invert(contained16in15)]
print(extra15)
# describe to what corresponds the extra columns
for col in extra15:
print(col, ' : ', next(item for item in js15['columns'] if item['name'] == col).get('description'))
# montrer les differences de colonnes entre les dataset
contained16in15 = [col in df16.columns for col in df15.columns ]
extra15 = df15.columns[np.invert(contained16in15)]
print(extra15)
# describe to what corresponds the extra columns
for col in extra15:
print(col, ' : ', next(item for item in js15['columns'] if item['name'] == col).get('description'))
Index(['Location', 'OtherFuelUse(kBtu)', 'GHGEmissions(MetricTonsCO2e)',
'GHGEmissionsIntensity(kgCO2e/ft2)', 'Comment', '2010 Census Tracts',
'Seattle Police Department Micro Community Policing Plan Areas',
'City Council Districts', 'SPD Beats', 'Zip Codes'],
dtype='object')
Location : None
OtherFuelUse(kBtu) : None
GHGEmissions(MetricTonsCO2e) : The total amount of greenhouse gas emissions, including carbon dioxide, methane, and nitrous oxide gases released into the atmosphere as a result of energy consumption at the property, measured in metric tons of carbon dioxide equivalent. This calculation uses a GHG emissions factor from Seattle CIty Light's portfolio of generating resources. This uses Seattle City Light's 2015 emissions factor of 52.44 lbs CO2e/MWh. Enwave steam factor = 170.17 lbs CO2e/MMBtu. Gas factor sourced from EPA Portfolio Manager = 53.11 kg CO2e/MBtu.
GHGEmissionsIntensity(kgCO2e/ft2) :
Total Greenhouse Gas Emissions divided by property's gross floor area, measured in kilograms of carbon dioxide equivalent per square foot. This calculation uses a GHG emissions factor from Seattle City Light's portfolio of generating resources
Comment : None
2010 Census Tracts : None
Seattle Police Department Micro Community Policing Plan Areas : None
City Council Districts : None
SPD Beats : None
Zip Codes : None
In [7]:
Copied!
contained15in16 = [col in df15.columns for col in df16.columns ]
extra16 = df16.columns[np.invert(contained15in16)]
print(extra16)
# describe to what corresponds the extra columns
for col in extra16:
print(col, ' : ', next(item for item in js16['columns'] if item['name'] == col).get('description'))
contained15in16 = [col in df15.columns for col in df16.columns ]
extra16 = df16.columns[np.invert(contained15in16)]
print(extra16)
# describe to what corresponds the extra columns
for col in extra16:
print(col, ' : ', next(item for item in js16['columns'] if item['name'] == col).get('description'))
Index(['Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude',
'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity'],
dtype='object')
Address : Property street address
City : Property city
State : Property state
ZipCode : Property zip
Latitude : Property latitude.
Longitude : Property longitude.
Comments : Comments by a building owner or agent to provide context to the building’s energy use.
TotalGHGEmissions : The total amount of greenhouse gas emissions, including carbon dioxide, methane, and nitrous oxide gases released into the atmosphere as a result of energy consumption at the property, measured in metric tons of carbon dioxide equivalent. This calculation uses a GHG emissions factor from Seattle CIty Light's portfolio of generating resources. This uses Seattle City Light's 2015 emissions factor of 52.44 lbs CO2e/MWh until the 2016 factor is available. Enwave steam factor = 170.17 lbs CO2e/MMBtu. Gas factor sourced from EPA Portfolio Manager = 53.11 kg CO2e/MBtu.
GHGEmissionsIntensity : Total Greenhouse Gas Emissions divided by property's gross floor area, measured in kilograms of carbon dioxide equivalent per square foot. This calculation uses a GHG emissions factor from Seattle City Light's portfolio of generating resources
In [8]:
Copied!
# examples of Location
print(df15.loc[0, 'Location'])
print(df16.loc[0, ['Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude']])
# df15 has location in one single row
# whereas df1516 has one column for each data
# let's split !
# examples of Location
print(df15.loc[0, 'Location'])
print(df16.loc[0, ['Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude']])
# df15 has location in one single row
# whereas df1516 has one column for each data
# let's split !
{'latitude': '47.61219025', 'longitude': '-122.33799744', 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}
Address 405 Olive way
City Seattle
State WA
ZipCode 98101.0
Latitude 47.6122
Longitude -122.33799
Name: 0, dtype: object
In [9]:
Copied!
import ast
# split Location column
for index, row in df15.iterrows():
l = ast.literal_eval(df15.loc[index, 'Location'])
addr = ast.literal_eval(l['human_address'])
df15.loc[index, 'Latitude'] = l['latitude']
df15.loc[index, 'Longitude'] = l['longitude']
df15.loc[index, 'Address'] = addr['address']
df15.loc[index, 'City'] = addr['city']
df15.loc[index, 'State'] = addr['state']
df15.loc[index, 'ZipCode'] = addr['zip']
# convert to correct type as ast convert to str
df15=df15.astype({'Latitude' : 'float64', 'Longitude' : 'float64', 'ZipCode' : 'float64'})
import ast
# split Location column
for index, row in df15.iterrows():
l = ast.literal_eval(df15.loc[index, 'Location'])
addr = ast.literal_eval(l['human_address'])
df15.loc[index, 'Latitude'] = l['latitude']
df15.loc[index, 'Longitude'] = l['longitude']
df15.loc[index, 'Address'] = addr['address']
df15.loc[index, 'City'] = addr['city']
df15.loc[index, 'State'] = addr['state']
df15.loc[index, 'ZipCode'] = addr['zip']
# convert to correct type as ast convert to str
df15=df15.astype({'Latitude' : 'float64', 'Longitude' : 'float64', 'ZipCode' : 'float64'})
In [10]:
Copied!
print(df16['TotalGHGEmissions'].describe())
print(df15['GHGEmissions(MetricTonsCO2e)'].describe())
print(df16['TotalGHGEmissions'].describe())
print(df15['GHGEmissions(MetricTonsCO2e)'].describe())
count 3367.000000 mean 119.723971 std 538.832227 min -0.800000 25% 9.495000 50% 33.920000 75% 93.940000 max 16870.980000 Name: TotalGHGEmissions, dtype: float64 count 3330.000000 mean 110.094102 std 409.450179 min 0.000000 25% 9.265000 50% 32.740000 75% 88.642500 max 11824.890000 Name: GHGEmissions(MetricTonsCO2e), dtype: float64
In [11]:
Copied!
print(df16['GHGEmissionsIntensity'].describe())
print(df15['GHGEmissionsIntensity(kgCO2e/ft2)'].describe())
print(df16['GHGEmissionsIntensity'].describe())
print(df15['GHGEmissionsIntensity(kgCO2e/ft2)'].describe())
count 3367.000000 mean 1.175916 std 1.821452 min -0.020000 25% 0.210000 50% 0.610000 75% 1.370000 max 34.090000 Name: GHGEmissionsIntensity, dtype: float64 count 3330.000000 mean 0.985339 std 1.637172 min 0.000000 25% 0.080000 50% 0.460000 75% 1.180000 max 31.380000 Name: GHGEmissionsIntensity(kgCO2e/ft2), dtype: float64
In [12]:
Copied!
# columns from df15 and df16 are on the same scale, so they can be merged as is in the resulting dataset
# copy identical columns then will drop all
df15['TotalGHGEmissions'] = df15['GHGEmissions(MetricTonsCO2e)']
df15['GHGEmissionsIntensity'] = df15['GHGEmissionsIntensity(kgCO2e/ft2)']
df15.drop(extra15, axis=1, inplace=True)
df16.drop('Comments', axis=1, inplace=True)
# columns from df15 and df16 are on the same scale, so they can be merged as is in the resulting dataset
# copy identical columns then will drop all
df15['TotalGHGEmissions'] = df15['GHGEmissions(MetricTonsCO2e)']
df15['GHGEmissionsIntensity'] = df15['GHGEmissionsIntensity(kgCO2e/ft2)']
df15.drop(extra15, axis=1, inplace=True)
df16.drop('Comments', axis=1, inplace=True)
In [13]:
Copied!
# merge common columns
df = pd.merge(df15, df16, how='outer')
dfbak = df.copy()
print(df15.shape, df16.shape, df.shape)
# merge common columns
df = pd.merge(df15, df16, how='outer')
dfbak = df.copy()
print(df15.shape, df16.shape, df.shape)
(3340, 45) (3376, 45) (6716, 45)
Exploratory Data Analysis¶
In [14]:
Copied!
df.head()
df.head()
Out[14]:
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | TaxParcelIdentificationNumber | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | ... | ComplianceStatus | Outlier | Latitude | Longitude | Address | City | State | ZipCode | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2015 | NonResidential | Hotel | MAYFLOWER PARK HOTEL | 659000030 | 7 | DOWNTOWN | 1927 | 1.0 | ... | Compliant | NaN | 47.612190 | -122.337997 | 405 OLIVE WAY | SEATTLE | WA | 98101.0 | 249.43 | 2.64 |
| 1 | 2 | 2015 | NonResidential | Hotel | PARAMOUNT HOTEL | 659000220 | 7 | DOWNTOWN | 1996 | 1.0 | ... | Compliant | NaN | 47.613106 | -122.333358 | 724 PINE ST | SEATTLE | WA | 98101.0 | 263.51 | 2.38 |
| 2 | 3 | 2015 | NonResidential | Hotel | WESTIN HOTEL | 659000475 | 7 | DOWNTOWN | 1969 | 1.0 | ... | Compliant | NaN | 47.613349 | -122.337699 | 1900 5TH AVE | SEATTLE | WA | 98101.0 | 2061.48 | 1.92 |
| 3 | 5 | 2015 | NonResidential | Hotel | HOTEL MAX | 659000640 | 7 | DOWNTOWN | 1926 | 1.0 | ... | Compliant | High Outlier | 47.614216 | -122.336609 | 620 STEWART ST | SEATTLE | WA | 98101.0 | 1936.34 | 31.38 |
| 4 | 8 | 2015 | NonResidential | Hotel | WARWICK SEATTLE HOTEL | 659000970 | 7 | DOWNTOWN | 1980 | 1.0 | ... | Compliant | NaN | 47.613754 | -122.340924 | 401 LENORA ST | SEATTLE | WA | 98121.0 | 507.70 | 4.02 |
5 rows × 45 columns
In [15]:
Copied!
df.info()
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6716 entries, 0 to 6715 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OSEBuildingID 6716 non-null int64 1 DataYear 6716 non-null int64 2 BuildingType 6716 non-null object 3 PrimaryPropertyType 6716 non-null object 4 PropertyName 6716 non-null object 5 TaxParcelIdentificationNumber 6714 non-null object 6 CouncilDistrictCode 6716 non-null int64 7 Neighborhood 6716 non-null object 8 YearBuilt 6716 non-null int64 9 NumberofBuildings 6708 non-null float64 10 NumberofFloors 6708 non-null float64 11 PropertyGFATotal 6716 non-null int64 12 PropertyGFAParking 6716 non-null int64 13 PropertyGFABuilding(s) 6716 non-null int64 14 ListOfAllPropertyUseTypes 6580 non-null object 15 LargestPropertyUseType 6560 non-null object 16 LargestPropertyUseTypeGFA 6560 non-null float64 17 SecondLargestPropertyUseType 3238 non-null object 18 SecondLargestPropertyUseTypeGFA 3238 non-null float64 19 ThirdLargestPropertyUseType 1156 non-null object 20 ThirdLargestPropertyUseTypeGFA 1156 non-null float64 21 YearsENERGYSTARCertified 229 non-null object 22 ENERGYSTARScore 5093 non-null float64 23 SiteEUI(kBtu/sf) 6699 non-null float64 24 SiteEUIWN(kBtu/sf) 6700 non-null float64 25 SourceEUI(kBtu/sf) 6697 non-null float64 26 SourceEUIWN(kBtu/sf) 6697 non-null float64 27 SiteEnergyUse(kBtu) 6701 non-null float64 28 SiteEnergyUseWN(kBtu) 6700 non-null float64 29 SteamUse(kBtu) 6697 non-null float64 30 Electricity(kWh) 6697 non-null float64 31 Electricity(kBtu) 6697 non-null float64 32 NaturalGas(therms) 6697 non-null float64 33 NaturalGas(kBtu) 6697 non-null float64 34 DefaultData 6715 non-null object 35 ComplianceStatus 6716 non-null object 36 Outlier 116 non-null object 37 Latitude 6716 non-null float64 38 Longitude 6716 non-null float64 39 Address 6716 non-null object 40 City 6716 non-null object 41 State 6716 non-null object 42 ZipCode 6700 non-null float64 43 TotalGHGEmissions 6697 non-null float64 44 GHGEmissionsIntensity 6697 non-null float64 dtypes: float64(22), int64(7), object(16) memory usage: 2.4+ MB
Location of Buildings¶
In [16]:
Copied!
# Just for fun, display the points on a map
import folium
from folium.plugins import MarkerCluster
m = folium.Map(location=[df.loc[0, 'Latitude'], df.loc[0, 'Longitude']], zoom_start=13)
marker_cluster = MarkerCluster().add_to(m)
for index, row in df.iterrows():
folium.Marker([row['Latitude'], row['Longitude']]).add_to(marker_cluster)
m
# Just for fun, display the points on a map
import folium
from folium.plugins import MarkerCluster
m = folium.Map(location=[df.loc[0, 'Latitude'], df.loc[0, 'Longitude']], zoom_start=13)
marker_cluster = MarkerCluster().add_to(m)
for index, row in df.iterrows():
folium.Marker([row['Latitude'], row['Longitude']]).add_to(marker_cluster)
m
Out[16]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Data Cleaning¶
In [17]:
Copied!
df.dtypes.value_counts().plot.pie()
df.dtypes.value_counts().plot.pie()
Out[17]:
<AxesSubplot:ylabel='None'>
Empty Features analysis¶
In [18]:
Copied!
import missingno as msno
msno.matrix(df)
import missingno as msno
msno.matrix(df)
Out[18]:
<AxesSubplot:>
In [19]:
Copied!
# show columns with (too much) empty values
df.isna().mean().sort_values().plot(kind="hist")
# show columns with (too much) empty values
df.isna().mean().sort_values().plot(kind="hist")
Out[19]:
<AxesSubplot:ylabel='Frequency'>
In [20]:
Copied!
# removing columns that have more than 50% of empty values
threshold = 0.5
for col in df.columns:
if df[col].isna().mean() > threshold:
print('removing empty column: ', col)
df.drop(col, axis=1, inplace=True)
# removing columns that have more than 50% of empty values
threshold = 0.5
for col in df.columns:
if df[col].isna().mean() > threshold:
print('removing empty column: ', col)
df.drop(col, axis=1, inplace=True)
removing empty column: SecondLargestPropertyUseType removing empty column: SecondLargestPropertyUseTypeGFA removing empty column: ThirdLargestPropertyUseType removing empty column: ThirdLargestPropertyUseTypeGFA removing empty column: YearsENERGYSTARCertified removing empty column: Outlier
Outliers Analysis¶
In [21]:
Copied!
def dropOutlierIQR(df, col):
Q1 = df[col].quantile(0.25)
# print(Q1)
Q3 = df[col].quantile(0.75)
# print(Q3)
IQR = Q3 - Q1
# print(IQR)
df[col] = df[col][df[col].between((Q1 - 1.5 * IQR), (Q3 + 1.5 * IQR))]
def dropOutlierIQR(df, col):
Q1 = df[col].quantile(0.25)
# print(Q1)
Q3 = df[col].quantile(0.75)
# print(Q3)
IQR = Q3 - Q1
# print(IQR)
df[col] = df[col][df[col].between((Q1 - 1.5 * IQR), (Q3 + 1.5 * IQR))]
In [22]:
Copied!
for col in df.select_dtypes(include=['float', 'int']).columns:
dropOutlierIQR(df, col)
for col in df.select_dtypes(include=['float', 'int']).columns:
dropOutlierIQR(df, col)
Non sense Values¶
In [23]:
Copied!
df.select_dtypes(exclude='object').lt(0).sum()
df.select_dtypes(exclude='object').lt(0).sum()
Out[23]:
OSEBuildingID 0 DataYear 0 CouncilDistrictCode 0 YearBuilt 0 NumberofBuildings 0 NumberofFloors 0 PropertyGFATotal 0 PropertyGFAParking 0 PropertyGFABuilding(s) 2 LargestPropertyUseTypeGFA 0 ENERGYSTARScore 0 SiteEUI(kBtu/sf) 0 SiteEUIWN(kBtu/sf) 0 SourceEUI(kBtu/sf) 1 SourceEUIWN(kBtu/sf) 2 SiteEnergyUse(kBtu) 0 SiteEnergyUseWN(kBtu) 0 SteamUse(kBtu) 0 Electricity(kWh) 1 Electricity(kBtu) 1 NaturalGas(therms) 0 NaturalGas(kBtu) 0 Latitude 0 Longitude 6522 ZipCode 0 TotalGHGEmissions 1 GHGEmissionsIntensity 1 dtype: int64
In [24]:
Copied!
# remove rows for features that shouldn't be negative
nonNegativeFeatures = ['PropertyGFABuilding(s)',
'SourceEUI(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'Electricity(kWh)',
'Electricity(kBtu)',
'TotalGHGEmissions',
'GHGEmissionsIntensity']
for col in nonNegativeFeatures:
df.drop(df.loc[df[col] < 0].index, inplace=True)
df.shape
# remove rows for features that shouldn't be negative
nonNegativeFeatures = ['PropertyGFABuilding(s)',
'SourceEUI(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'Electricity(kWh)',
'Electricity(kBtu)',
'TotalGHGEmissions',
'GHGEmissionsIntensity']
for col in nonNegativeFeatures:
df.drop(df.loc[df[col] < 0].index, inplace=True)
df.shape
Out[24]:
(6712, 39)
In [25]:
Copied!
df.select_dtypes(exclude='object').eq(0).sum()
df.select_dtypes(exclude='object').eq(0).sum()
Out[25]:
OSEBuildingID 0 DataYear 0 CouncilDistrictCode 0 YearBuilt 0 NumberofBuildings 0 NumberofFloors 21 PropertyGFATotal 0 PropertyGFAParking 5286 PropertyGFABuilding(s) 0 LargestPropertyUseTypeGFA 0 ENERGYSTARScore 0 SiteEUI(kBtu/sf) 18 SiteEUIWN(kBtu/sf) 41 SourceEUI(kBtu/sf) 26 SourceEUIWN(kBtu/sf) 48 SiteEnergyUse(kBtu) 20 SiteEnergyUseWN(kBtu) 41 SteamUse(kBtu) 6433 Electricity(kWh) 16 Electricity(kBtu) 16 NaturalGas(therms) 2510 NaturalGas(kBtu) 2510 Latitude 0 Longitude 0 ZipCode 0 TotalGHGEmissions 11 GHGEmissionsIntensity 20 dtype: int64
In [26]:
Copied!
# remove rows for features that shouldn't be equal to zero
strictPositiveFeatures = ['NumberofFloors',
'SourceEUI(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'SourceEUI(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'SiteEnergyUse(kBtu)',
'SiteEnergyUseWN(kBtu)',
'Electricity(kWh)',
'Electricity(kBtu)',
'TotalGHGEmissions',
'GHGEmissionsIntensity']
for col in strictPositiveFeatures:
df.drop(df.loc[df[col] == 0].index, inplace=True)
df.shape
# remove rows for features that shouldn't be equal to zero
strictPositiveFeatures = ['NumberofFloors',
'SourceEUI(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'SourceEUI(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'SiteEnergyUse(kBtu)',
'SiteEnergyUseWN(kBtu)',
'Electricity(kWh)',
'Electricity(kBtu)',
'TotalGHGEmissions',
'GHGEmissionsIntensity']
for col in strictPositiveFeatures:
df.drop(df.loc[df[col] == 0].index, inplace=True)
df.shape
Out[26]:
(6634, 39)
In [27]:
Copied!
# Remove rows where default data has been set
df.drop(df.loc[df['DefaultData'] == True].index, inplace=True)
# Remove rows where default data has been set
df.drop(df.loc[df['DefaultData'] == True].index, inplace=True)
Feature Engineering¶
Building Age¶
In [28]:
Copied!
# feature engineering : we create an age od the building, more convenient than year of build
df['BuildingAge'] = df.DataYear - df.YearBuilt
# feature engineering : we create an age od the building, more convenient than year of build
df['BuildingAge'] = df.DataYear - df.YearBuilt
Location¶
In [29]:
Copied!
from geopy.distance import distance
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
# official position of Seattle (https://www.latlong.net/place/seattle-wa-usa-2655.html)
d0 = (47.608013, -122.335167)
points = zip(df.Latitude, df.Longitude)
# for p in points:
# print(p)
df['Distance'] = [distance(d0, point).miles for point in zip(df.Latitude, df.Longitude)]
df.drop(labels=['Latitude', 'Longitude'], axis=1, inplace=True)
from geopy.distance import distance
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
# official position of Seattle (https://www.latlong.net/place/seattle-wa-usa-2655.html)
d0 = (47.608013, -122.335167)
points = zip(df.Latitude, df.Longitude)
# for p in points:
# print(p)
df['Distance'] = [distance(d0, point).miles for point in zip(df.Latitude, df.Longitude)]
df.drop(labels=['Latitude', 'Longitude'], axis=1, inplace=True)
Features Cleanup¶
In [30]:
Copied!
# don't need this feature, since duplicates the one with kBtu
df.drop('Electricity(kWh)', axis=1, inplace=True)
# also, we don't need the features 'WN', they are highly correlated with their siblings 'without WN'
colsWN = []
for col in df.columns:
if 'WN' in col:
colsWN.append(col)
df.drop(colsWN, axis=1, inplace=True)
# lastly, we can remove NaturalGas(therms), since it the same than naturalGas(kBtu), only the unit change
df.drop('NaturalGas(therms)', axis=1, inplace=True)
# State has unique value
df.drop('State', axis=1, inplace=True)
# except BuildingAge, none of the other features are very interesting, we could drop them
df.drop(['YearBuilt', 'CouncilDistrictCode', 'DataYear'], axis=1, inplace=True)
# some features are not interesting, we could drop them
df.drop(['ZipCode', 'OSEBuildingID', 'SteamUse(kBtu)', 'PropertyGFAParking', 'NumberofBuildings', 'TaxParcelIdentificationNumber'], axis=1, inplace=True)
# remove Default Data feature
df.drop('DefaultData', axis=1, inplace=True)
# Some features are not interesting, we can remove them
df.drop(['PropertyName', 'Address', 'City'], axis=1, inplace=True)
# don't need this feature, since duplicates the one with kBtu
df.drop('Electricity(kWh)', axis=1, inplace=True)
# also, we don't need the features 'WN', they are highly correlated with their siblings 'without WN'
colsWN = []
for col in df.columns:
if 'WN' in col:
colsWN.append(col)
df.drop(colsWN, axis=1, inplace=True)
# lastly, we can remove NaturalGas(therms), since it the same than naturalGas(kBtu), only the unit change
df.drop('NaturalGas(therms)', axis=1, inplace=True)
# State has unique value
df.drop('State', axis=1, inplace=True)
# except BuildingAge, none of the other features are very interesting, we could drop them
df.drop(['YearBuilt', 'CouncilDistrictCode', 'DataYear'], axis=1, inplace=True)
# some features are not interesting, we could drop them
df.drop(['ZipCode', 'OSEBuildingID', 'SteamUse(kBtu)', 'PropertyGFAParking', 'NumberofBuildings', 'TaxParcelIdentificationNumber'], axis=1, inplace=True)
# remove Default Data feature
df.drop('DefaultData', axis=1, inplace=True)
# Some features are not interesting, we can remove them
df.drop(['PropertyName', 'Address', 'City'], axis=1, inplace=True)
In [31]:
Copied!
print(df.ListOfAllPropertyUseTypes.describe())
df.ListOfAllPropertyUseTypes.head()
# this feature is a concatenation of several values, seperated by ', '
# we could try to seperate them, but in fact this feature is not very intersting, since there is feature 'LargestPropertyUseType', that we'll use instead
df.drop('ListOfAllPropertyUseTypes', axis=1, inplace=True)
print(df.ListOfAllPropertyUseTypes.describe())
df.ListOfAllPropertyUseTypes.head()
# this feature is a concatenation of several values, seperated by ', '
# we could try to seperate them, but in fact this feature is not very intersting, since there is feature 'LargestPropertyUseType', that we'll use instead
df.drop('ListOfAllPropertyUseTypes', axis=1, inplace=True)
count 6190 unique 477 top Multifamily Housing freq 1616 Name: ListOfAllPropertyUseTypes, dtype: object
In [32]:
Copied!
df.ComplianceStatus.value_counts().plot.barh()
# most of the values are compliant, we could remove the ones which are not (they may be misleading our predictions since the energy consumption is not accurate)
df = df[df.ComplianceStatus == 'Compliant']
df.drop(['ComplianceStatus'], axis=1, inplace=True)
df.ComplianceStatus.value_counts().plot.barh()
# most of the values are compliant, we could remove the ones which are not (they may be misleading our predictions since the energy consumption is not accurate)
df = df[df.ComplianceStatus == 'Compliant']
df.drop(['ComplianceStatus'], axis=1, inplace=True)
Remove & Fill NaN¶
In [33]:
Copied!
# replace or fill NaN values for some features
# example for NumberofFloors --> 0 by default. we consider a building has at least one floor (groundfloor UK ), it's a value that exists and thus is valid
for col in df.columns:
print(col, df[col].isna().sum())
values = {
'NumberofFloors' : 0,
'SiteEUI(kBtu/sf)' : 0,
'SourceEUI(kBtu/sf)' : 0,
'SiteEnergyUse(kBtu)' : 0,
}
df.fillna(value=values, inplace=True)
# drop nan after
# nadrop = ['TotalGHGEmissions', 'NaturalGas(kBtu)', 'Electricity(kBtu)' ]
nadrop = ['TotalGHGEmissions', 'PropertyGFATotal' ]
df.dropna(subset=nadrop, inplace=True)
# replace or fill NaN values for some features
# example for NumberofFloors --> 0 by default. we consider a building has at least one floor (groundfloor UK ), it's a value that exists and thus is valid
for col in df.columns:
print(col, df[col].isna().sum())
values = {
'NumberofFloors' : 0,
'SiteEUI(kBtu/sf)' : 0,
'SourceEUI(kBtu/sf)' : 0,
'SiteEnergyUse(kBtu)' : 0,
}
df.fillna(value=values, inplace=True)
# drop nan after
# nadrop = ['TotalGHGEmissions', 'NaturalGas(kBtu)', 'Electricity(kBtu)' ]
nadrop = ['TotalGHGEmissions', 'PropertyGFATotal' ]
df.dropna(subset=nadrop, inplace=True)
BuildingType 0 PrimaryPropertyType 0 Neighborhood 0 NumberofFloors 465 PropertyGFATotal 695 PropertyGFABuilding(s) 645 LargestPropertyUseType 141 LargestPropertyUseTypeGFA 796 ENERGYSTARScore 1530 SiteEUI(kBtu/sf) 496 SourceEUI(kBtu/sf) 583 SiteEnergyUse(kBtu) 735 Electricity(kBtu) 753 NaturalGas(kBtu) 642 TotalGHGEmissions 685 GHGEmissionsIntensity 525 BuildingAge 0 Distance 0
In [34]:
Copied!
df.columns
df.columns
Out[34]:
Index(['BuildingType', 'PrimaryPropertyType', 'Neighborhood', 'NumberofFloors',
'PropertyGFATotal', 'PropertyGFABuilding(s)', 'LargestPropertyUseType',
'LargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)',
'SourceEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'Electricity(kBtu)',
'NaturalGas(kBtu)', 'TotalGHGEmissions', 'GHGEmissionsIntensity',
'BuildingAge', 'Distance'],
dtype='object')
In [35]:
Copied!
df.shape
df.shape
Out[35]:
(5258, 18)
In [36]:
Copied!
dfbak = df.copy()
df.to_pickle("./tmp/df_cleaned.gzip")
dfbak = df.copy()
df.to_pickle("./tmp/df_cleaned.gzip")
In [37]:
Copied!
df['NaturalGas(kBtu)'].describe()
df['NaturalGas(kBtu)'].describe()
Out[37]:
count 5.159000e+03 mean 4.860971e+05 std 6.517949e+05 min 0.000000e+00 25% 0.000000e+00 50% 1.904470e+05 75% 7.796090e+05 max 2.873052e+06 Name: NaturalGas(kBtu), dtype: float64
Features Analysis¶
Features Correlations Analysis¶
In [38]:
Copied!
# features correlations
def show_feat_correlations(df):
correlations = np.abs(df.select_dtypes('float').corr(method="pearson"))
mask = np.zeros_like(correlations)
mask[np.triu_indices_from(mask)] = True
sns.axes_style("white")
plt.subplots(figsize=(15,15))
sns.heatmap(correlations, mask=mask, vmax=1, square=True, xticklabels=correlations.columns, yticklabels=correlations.columns, annot=True, cmap='cividis')
# features correlations
def show_feat_correlations(df):
correlations = np.abs(df.select_dtypes('float').corr(method="pearson"))
mask = np.zeros_like(correlations)
mask[np.triu_indices_from(mask)] = True
sns.axes_style("white")
plt.subplots(figsize=(15,15))
sns.heatmap(correlations, mask=mask, vmax=1, square=True, xticklabels=correlations.columns, yticklabels=correlations.columns, annot=True, cmap='cividis')
In [39]:
Copied!
show_feat_correlations(df)
show_feat_correlations(df)
In [40]:
Copied!
# focus on these features
sns.scatterplot(x='LargestPropertyUseTypeGFA', y='PropertyGFATotal', data=df)
# these two features are highly correlated, so one can/should be removed
# focus on these features
sns.scatterplot(x='LargestPropertyUseTypeGFA', y='PropertyGFATotal', data=df)
# these two features are highly correlated, so one can/should be removed
Out[40]:
<AxesSubplot:xlabel='LargestPropertyUseTypeGFA', ylabel='PropertyGFATotal'>
In [41]:
Copied!
# some features are higly correlated (>0.7), thus we should remove them, otherwise for our further models, we will have data leakage.
# 'PropertyGFABuilding(s)' : can be overcomed by 'PropertyGFATotal'
df.drop(labels=['PropertyGFABuilding(s)', 'SiteEUI(kBtu/sf)'], axis=1, inplace=True)
# df.drop(labels=['LargestPropertyUseType'], axis=1, inplace=True)
df.drop(labels=['LargestPropertyUseTypeGFA'], axis=1, inplace=True)
# df.drop(labels=['SiteEnergyUse(kBtu)'], axis=1, inplace=True)
# some features are higly correlated (>0.7), thus we should remove them, otherwise for our further models, we will have data leakage.
# 'PropertyGFABuilding(s)' : can be overcomed by 'PropertyGFATotal'
df.drop(labels=['PropertyGFABuilding(s)', 'SiteEUI(kBtu/sf)'], axis=1, inplace=True)
# df.drop(labels=['LargestPropertyUseType'], axis=1, inplace=True)
df.drop(labels=['LargestPropertyUseTypeGFA'], axis=1, inplace=True)
# df.drop(labels=['SiteEnergyUse(kBtu)'], axis=1, inplace=True)
In [42]:
Copied!
show_feat_correlations(df)
show_feat_correlations(df)
In [43]:
Copied!
df.to_pickle("./tmp/df_cleaned.gzip")
df.to_pickle("./tmp/df_cleaned.gzip")
Objects type features¶
In [44]:
Copied!
df.select_dtypes('object').nunique()
df.select_dtypes('object').nunique()
Out[44]:
BuildingType 8 PrimaryPropertyType 31 Neighborhood 19 LargestPropertyUseType 52 dtype: int64
In [45]:
Copied!
plt.rcParams.update({'font.size': 8})
feats = df.select_dtypes('object').columns
nrows = int(feats.shape[0])
ncols = 1
fig, ax = plt.subplots(nrows, ncols, sharex=False, sharey=False, figsize=(10,30))
for i, col in enumerate(feats):
# print(col)
# print(df[col].describe())
# df[col].value_counts().plot(kind='barh', ax=ax[i], xlabel=col)
sns.countplot(data=df, y=col, ax=ax[i])
plt.rcParams.update({'font.size': 8})
feats = df.select_dtypes('object').columns
nrows = int(feats.shape[0])
ncols = 1
fig, ax = plt.subplots(nrows, ncols, sharex=False, sharey=False, figsize=(10,30))
for i, col in enumerate(feats):
# print(col)
# print(df[col].describe())
# df[col].value_counts().plot(kind='barh', ax=ax[i], xlabel=col)
sns.countplot(data=df, y=col, ax=ax[i])
Numerical Features¶
In [46]:
Copied!
df.hist(figsize=(15,15), bins=100)
df.hist(figsize=(15,15), bins=100)
Out[46]:
array([[<AxesSubplot:title={'center':'NumberofFloors'}>,
<AxesSubplot:title={'center':'PropertyGFATotal'}>,
<AxesSubplot:title={'center':'ENERGYSTARScore'}>],
[<AxesSubplot:title={'center':'SourceEUI(kBtu/sf)'}>,
<AxesSubplot:title={'center':'SiteEnergyUse(kBtu)'}>,
<AxesSubplot:title={'center':'Electricity(kBtu)'}>],
[<AxesSubplot:title={'center':'NaturalGas(kBtu)'}>,
<AxesSubplot:title={'center':'TotalGHGEmissions'}>,
<AxesSubplot:title={'center':'GHGEmissionsIntensity'}>],
[<AxesSubplot:title={'center':'BuildingAge'}>,
<AxesSubplot:title={'center':'Distance'}>, <AxesSubplot:>]],
dtype=object)
Distributions¶
In [47]:
Copied!
plt.rcParams.update({'font.size': 8})
plt.figure(figsize=(15,8))
df.select_dtypes('float').boxplot(vert=False)
plt.xscale('log')
plt.rcParams.update({'font.size': 8})
plt.figure(figsize=(15,8))
df.select_dtypes('float').boxplot(vert=False)
plt.xscale('log')
Univariate Analysis¶
In [48]:
Copied!
# TotalGHGEmissions & GHGEmissionsIntensity specific analysis
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(15,5))
sns.distplot(df.TotalGHGEmissions, ax=ax[0])
sns.distplot(df.GHGEmissionsIntensity, ax=ax[1])
# TotalGHGEmissions & GHGEmissionsIntensity specific analysis
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(15,5))
sns.distplot(df.TotalGHGEmissions, ax=ax[0])
sns.distplot(df.GHGEmissionsIntensity, ax=ax[1])
Out[48]:
<AxesSubplot:xlabel='GHGEmissionsIntensity', ylabel='Density'>
Multivariate Analysis¶
In [49]:
Copied!
plt.rcParams.update({'font.size': 10})
fig, ax = plt.subplots(nrows=2, ncols=2, sharex=False, sharey=False, figsize=(15,15))
sns.scatterplot(x='NaturalGas(kBtu)', y='Electricity(kBtu)', data=df ,ax=ax[0,0])
sns.scatterplot(x='SiteEnergyUse(kBtu)', y='TotalGHGEmissions', data=df, ax=ax[0,1])
sns.scatterplot(x='Electricity(kBtu)', y='TotalGHGEmissions', data=df, ax=ax[1,0])
sns.scatterplot(x='NaturalGas(kBtu)', y='TotalGHGEmissions', data=df, ax=ax[1,1])
plt.rcParams.update({'font.size': 10})
fig, ax = plt.subplots(nrows=2, ncols=2, sharex=False, sharey=False, figsize=(15,15))
sns.scatterplot(x='NaturalGas(kBtu)', y='Electricity(kBtu)', data=df ,ax=ax[0,0])
sns.scatterplot(x='SiteEnergyUse(kBtu)', y='TotalGHGEmissions', data=df, ax=ax[0,1])
sns.scatterplot(x='Electricity(kBtu)', y='TotalGHGEmissions', data=df, ax=ax[1,0])
sns.scatterplot(x='NaturalGas(kBtu)', y='TotalGHGEmissions', data=df, ax=ax[1,1])
Out[49]:
<AxesSubplot:xlabel='NaturalGas(kBtu)', ylabel='TotalGHGEmissions'>
In [50]:
Copied!
# note : uncomment to allow interactive drawing, according to runtime environment
# %matplotlib notebook
# %matplotlib widget
# %matplotlib inline
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')
ax.scatter(df['NaturalGas(kBtu)'], df['Electricity(kBtu)'], df['TotalGHGEmissions'], color='green')
ax.set_title("3D plot")
ax.set_xlabel('NaturalGas(kBtu)')
ax.set_ylabel('Electricity(kBtu)')
ax.set_zlabel('TotalGHGEmissions')
plt.show()
# plt.imshow(fig)
# note : uncomment to allow interactive drawing, according to runtime environment
# %matplotlib notebook
# %matplotlib widget
# %matplotlib inline
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')
ax.scatter(df['NaturalGas(kBtu)'], df['Electricity(kBtu)'], df['TotalGHGEmissions'], color='green')
ax.set_title("3D plot")
ax.set_xlabel('NaturalGas(kBtu)')
ax.set_ylabel('Electricity(kBtu)')
ax.set_zlabel('TotalGHGEmissions')
plt.show()
# plt.imshow(fig)
In [51]:
Copied!
# %matplotlib inline
# %matplotlib inline
Energy Ratios¶
In [52]:
Copied!
# we don't want to rely on the consumption data, because this is costly to get. Our model must do without it
# furthermore, these features are higly correlated, so one can be removed
# but we can keep the proportion of each one, to know for example what is the main energy source of a building
# df['ElectricityRate'] = df['Electricity(kBtu)'] / (df['Electricity(kBtu)'] + df['NaturalGas(kBtu)']+1)
df['NaturalGasRate'] = df['NaturalGas(kBtu)'] / (df['Electricity(kBtu)'] + df['NaturalGas(kBtu)']+1)
df.drop(['Electricity(kBtu)', 'NaturalGas(kBtu)'], axis=1, inplace=True)
# we don't want to rely on the consumption data, because this is costly to get. Our model must do without it
# furthermore, these features are higly correlated, so one can be removed
# but we can keep the proportion of each one, to know for example what is the main energy source of a building
# df['ElectricityRate'] = df['Electricity(kBtu)'] / (df['Electricity(kBtu)'] + df['NaturalGas(kBtu)']+1)
df['NaturalGasRate'] = df['NaturalGas(kBtu)'] / (df['Electricity(kBtu)'] + df['NaturalGas(kBtu)']+1)
df.drop(['Electricity(kBtu)', 'NaturalGas(kBtu)'], axis=1, inplace=True)
In [53]:
Copied!
sns.catplot(x='NaturalGasRate', y='Neighborhood', data=df,kind="violin", width=2, height=15, aspect=1.5)
sns.catplot(x='NaturalGasRate', y='Neighborhood', data=df,kind="violin", width=2, height=15, aspect=1.5)
Out[53]:
<seaborn.axisgrid.FacetGrid at 0x7fcb5c467d60>
In [54]:
Copied!
# fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30,10))
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30,10))
sns.boxplot(x='TotalGHGEmissions', y='Neighborhood', data=df, ax=ax)
# sns.boxplot(x='TotalGHGEmissions', y='LargestPropertyUseType', data=df, ax=ax[1])
# fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30,10))
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30,10))
sns.boxplot(x='TotalGHGEmissions', y='Neighborhood', data=df, ax=ax)
# sns.boxplot(x='TotalGHGEmissions', y='LargestPropertyUseType', data=df, ax=ax[1])
Out[54]:
<AxesSubplot:xlabel='TotalGHGEmissions', ylabel='Neighborhood'>
In [55]:
Copied!
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(16,8))
print(ax.shape)
# plt.figure(figsize=(8,8))
sns.scatterplot(y='TotalGHGEmissions', x='BuildingAge', data=df, ax=ax[0])
ax1 = ax[1]
ax2 = ax1.twinx()
sns.scatterplot(y='TotalGHGEmissions', x='Distance', data=df, ax=ax1, color='red', alpha=0.5)
sns.scatterplot(y='SiteEnergyUse(kBtu)', x='Distance', data=df, ax=ax2, color='blue', alpha=0.5)
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(16,8))
print(ax.shape)
# plt.figure(figsize=(8,8))
sns.scatterplot(y='TotalGHGEmissions', x='BuildingAge', data=df, ax=ax[0])
ax1 = ax[1]
ax2 = ax1.twinx()
sns.scatterplot(y='TotalGHGEmissions', x='Distance', data=df, ax=ax1, color='red', alpha=0.5)
sns.scatterplot(y='SiteEnergyUse(kBtu)', x='Distance', data=df, ax=ax2, color='blue', alpha=0.5)
(2,)
Out[55]:
<AxesSubplot:xlabel='Distance', ylabel='SiteEnergyUse(kBtu)'>
In [56]:
Copied!
fig, ax = plt.subplots(figsize=(15,10))
sns.scatterplot(data=df[(df.BuildingType == 'Multifamily MR (5-9)') | (df.BuildingType == 'Multifamily LR (1-4)')], x='SiteEnergyUse(kBtu)', y='TotalGHGEmissions', hue='BuildingType', ax=ax)
fig, ax = plt.subplots(figsize=(15,10))
sns.scatterplot(data=df[(df.BuildingType == 'Multifamily MR (5-9)') | (df.BuildingType == 'Multifamily LR (1-4)')], x='SiteEnergyUse(kBtu)', y='TotalGHGEmissions', hue='BuildingType', ax=ax)
Out[56]:
<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='TotalGHGEmissions'>
In [57]:
Copied!
df.to_pickle("./tmp/df_cleaned.gzip")
df.to_pickle("./tmp/df_cleaned.gzip")